# Packages used by the statistical plots and correlation matrices below.
packages <- c(
  "ggstatsplot", "tidyverse", "corrplot", "ggpubr", "GGally", "corrgram"
)

# Install whatever is missing, then attach each package.
# requireNamespace() only checks availability; library() is the single place
# that attaches, and it stops with an error if installation did not succeed.
for (p in packages) {
  if (!requireNamespace(p, quietly = TRUE)) {
    install.packages(p)
  }
  library(p, character.only = TRUE)
}

# Exam data; the plots below read its GENDER, RACE, ENGLISH, MATHS and ID
# columns.
exam <- read_csv("data/Exam_data.csv")
Arguments to expose as input parameters for the project, e.g.:
plot.type - whether users want to see a violin plot, a boxplot, or a combined violin/boxplot;
type - parametric / non-parametric / robust / Bayes;
confidence interval - ranges from 0 to 1.
Only show the key ones - the rest can be left at their defaults.
Reference: https://indrajeetpatil.github.io/ggstatsplot/reference/ggbetweenstats.html
# Compare MATHS scores across GENDER groups, leaving every other
# ggbetweenstats() option at its default.
ggbetweenstats(
  data  = exam,
  y     = MATHS,
  x     = GENDER,
  title = "Comparison of Maths Scores by gender"
)
To compare medians, use the non-parametric test (type = 'np').
# Fix the random seed so repeated runs give the same output.
set.seed(12345)

# Same gender comparison as before, now requesting the "np" test type.
ggbetweenstats(
  data  = exam,
  type  = "np",
  y     = MATHS,
  x     = GENDER,
  title = "Comparison of Maths Scores by gender"
)
Usually we only want to show the significant pairwise comparisons,
i.e. those with a p-value smaller than 0.05.
# Compare MATHS scores across RACE groups with pairwise comparisons.
# The trailing comma after the last argument was removed: it passed an empty
# argument into the call, which can fail once the arguments are forwarded.
# NOTE(review): `mean.ci` may not be accepted by recent ggstatsplot
# releases - confirm against the installed version.
ggbetweenstats(
  data = exam,
  x = RACE,
  y = MATHS,
  type = "np",
  mean.ci = TRUE,
  pairwise.comparisons = TRUE,
  pairwise.display = "s",   # "s": show only the significant comparisons
  p.adjust.method = "fdr"   # adjust pairwise p-values with the "fdr" method
)
## Scatterplot
# Scatterplot of MATHS against ENGLISH. Points are labelled with their ID
# only where ENGLISH is below 30 and MATHS is above 75.
ggscatterstats(
  data             = exam,
  y                = MATHS,
  x                = ENGLISH,
  title            = "Relationship between English and Maths",
  label.var        = ID,
  label.expression = ENGLISH < 30 & MATHS > 75
)
First fit a linear model using the lm() function.
Statistics shown: parameter estimate, t-statistic, p-value.
wine <- read_csv("data/wine_quality.csv")

# Build a 20-bin histogram of one column, given the column name as a string.
# Replaces eleven copy-pasted ggplot() calls that differed only in the column.
#   column: name of the column to plot (character scalar).
#   data:   data frame holding that column; defaults to `wine`.
# Returns a ggplot object.
plot_histogram <- function(column, data = wine) {
  ggplot(data = data, aes(x = .data[[column]])) +
    geom_histogram(bins = 20, colour = "black", fill = "light blue") +
    # Set the axis title explicitly so it is always the plain column name.
    labs(x = column)
}

# One histogram per variable; the original object names are kept.
fa <- plot_histogram("fixed acidity")
va <- plot_histogram("volatile acidity")
ca <- plot_histogram("citric acid")
rs <- plot_histogram("residual sugar")
ch <- plot_histogram("chlorides")
fsd <- plot_histogram("free sulfur dioxide")
tsd <- plot_histogram("total sulfur dioxide")
density <- plot_histogram("density")
ph <- plot_histogram("pH")
sul <- plot_histogram("sulphates")
al <- plot_histogram("alcohol")

# Lay the eleven histograms out on a 4-column by 3-row grid.
ggarrange(
  fa, va, ca, rs, ch, fsd, tsd, density, ph, sul, al,
  ncol = 4, nrow = 3
)
ggcorrmat from ggstatsplot - 3 things to watch for:
colour - uses a diverging colour scale
cross - indicates that the correlation between the two variables is not statistically significant
benefits - provides a statistical test, useful if you're interested in finding out how significant each correlation is
# Correlation matrix plot over the first 11 columns of wine.
ggcorrmat(data = wine, cor.vars = 1:11)
From the corrplot package:
benefits: able to reorder the correlation matrix - hclust, alphabetical, PCA, etc. See https://cran.r-project.org/web/packages/corrplot/vignettes/corrplot-intro.html
BUT corrplot cannot take a data frame directly - use the cor() function to produce a correlation matrix first.
benefits: more useful than the ggstatsplot package for variable selection, because corrplot lets you rearrange the variables.
# corrplot() needs a correlation matrix, so compute one from columns 1-11.
wine.cor <- cor(wine[, 1:11])

# Lower triangle only, drawn as ellipses.
corrplot(
  wine.cor,
  type   = "lower",
  method = "ellipse",
  tl.col = "black",  # colour of the text labels
  diag   = FALSE     # omit the diagonal cells
)
# Recompute the correlation matrix from columns 1-11, which corrplot() needs.
wine.cor <- cor(wine[, 1:11])

# Lower triangle only, drawn as coloured cells, with variables reordered
# by the "AOE" ordering.
corrplot(
  wine.cor,
  type   = "lower",
  method = "color",
  order  = "AOE",
  tl.col = "black",  # colour of the text labels
  diag   = FALSE     # omit the diagonal cells
)
# Pairwise plot matrix of the first 11 columns of wine.
# The former `size = 0.1` argument was dropped: ggpairs() ignored it and only
# emitted an "Extra arguments" warning, so the resulting plot is unchanged.
# NOTE(review): if smaller points were intended, GGally's route is e.g.
# lower = list(continuous = wrap("points", size = 0.1)) - confirm before use.
ggpairs(
  data = wine,
  columns = 1:11
)
# Corrgram of the wine correlation matrix, with reordering switched on and
# panel.cor used for the upper panel.
corrgram(
  wine.cor,
  main        = "Correlation",
  upper.panel = panel.cor,
  order       = TRUE
)
if you don't have a lot of variables
consider whether you need to scale or standardise it.
# Load the WHData-2018 CSV used by the heatmap and parallel plots below.
wh <- read_csv(file = "data/WHData-2018.csv")
Ideally you want an interactive heatmap, as you usually have a lot of variables.
# Additional packages for the interactive heatmap and parallel plots.
packages <- c("heatmaply", "readr", "seriation", "dendextend", "parallelPlot")

# Install whatever is missing, then attach each package.
# requireNamespace() only checks availability; library() is the single place
# that attaches, and it stops with an error if installation did not succeed.
for (p in packages) {
  if (!requireNamespace(p, quietly = TRUE)) {
    install.packages(p)
  }
  library(p, character.only = TRUE)
}
# Country becomes the row name.
# Setting row names directly on a tibble is deprecated and raised a warning,
# so `wh` is converted to a base data frame first.
wh <- as.data.frame(wh)
row.names(wh) <- wh$Country

# Keep columns 3 and 7-12, then convert them to a numeric matrix.
wh1 <- select(wh, c(3, 7:12))
wh_matrix <- data.matrix(wh1)
Avoid using red and green.
Avoid using diverging colours.
# Interactive heatmap of the normalised indicator matrix.
heatmaply(
  normalize(wh_matrix),
  main         = "World Happiness Score and Variables by Country, 2018",
  xlab         = "World Happiness Indicators",
  ylab         = "Countries",
  colors       = Blues,
  Colv         = NA,
  seriate      = "none",
  k_row        = 5,                    # 5 clusters
  margins      = c(NA, 200, 60, NA),
  fontsize_row = 4,
  fontsize_col = 5
)
K-means clustering results cannot be aggregated upwards or downwards, so use a parallel coordinates plot instead.
# Basic parallel coordinates plot of the whole data set.
parallelPlot(wh)

# One TRUE per column of wh, so a histogram is requested for every variable.
histo <- rep(TRUE, times = ncol(wh))

# Same plot, with the histogram visibility flags supplied.
parallelPlot(
  wh,
  histoVisibility = histo
)